import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter
from sklearn import preprocessing
import sys
# Load the spreadsheet into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and user-specific; the file must exist there.
DATA_PATH = '/users/sarafarhat/Desktop/data.xlsx'
data = pd.read_excel(DATA_PATH)
data.head()
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
| 1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
| 2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
| 3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
| 4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
conda install -c conda-forge pandas-profiling
Collecting package metadata (current_repodata.json): done Solving environment: done # All requested packages already installed. Note: you may need to restart the kernel to use updated packages.
# Build a pandas-profiling report for the loaded DataFrame.
import pandas_profiling as pp
pp.ProfileReport(data)
# Summary statistics per column: count, mean, std, min, quartiles and max.
data.describe()
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 |
| mean | 54.366337 | 0.683168 | 0.966997 | 131.623762 | 246.264026 | 0.148515 | 0.528053 | 149.646865 | 0.326733 | 1.039604 | 1.399340 | 0.729373 | 2.313531 | 0.544554 |
| std | 9.082101 | 0.466011 | 1.032052 | 17.538143 | 51.830751 | 0.356198 | 0.525860 | 22.905161 | 0.469794 | 1.161075 | 0.616226 | 1.022606 | 0.612277 | 0.498835 |
| min | 29.000000 | 0.000000 | 0.000000 | 94.000000 | 126.000000 | 0.000000 | 0.000000 | 71.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 47.500000 | 0.000000 | 0.000000 | 120.000000 | 211.000000 | 0.000000 | 0.000000 | 133.500000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 |
| 50% | 55.000000 | 1.000000 | 1.000000 | 130.000000 | 240.000000 | 0.000000 | 1.000000 | 153.000000 | 0.000000 | 0.800000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 |
| 75% | 61.000000 | 1.000000 | 2.000000 | 140.000000 | 274.500000 | 0.000000 | 1.000000 | 166.000000 | 1.000000 | 1.600000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 |
| max | 77.000000 | 1.000000 | 3.000000 | 200.000000 | 564.000000 | 1.000000 | 2.000000 | 202.000000 | 1.000000 | 6.200000 | 2.000000 | 4.000000 | 3.000000 | 1.000000 |
# Count the missing values in each column.
data.isna().sum()
age 0 sex 0 cp 0 trestbps 0 chol 0 fbs 0 restecg 0 thalach 0 exang 0 oldpeak 0 slope 0 ca 0 thal 0 target 0 dtype: int64
# (number of rows, number of columns) of the DataFrame.
data.shape
(303, 14)
# Frequency of each distinct full row; a count above 1 means the row is repeated.
data.value_counts()
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
38 1 2 138 175 0 1 173 0 0.0 2 4 2 1 2
77 1 0 125 304 0 0 162 1 0.0 2 3 2 0 1
51 0 2 130 256 0 0 149 0 0.5 2 0 2 1 1
140 308 0 0 142 0 1.5 2 1 2 1 1
1 0 140 261 0 0 186 1 0.0 2 0 2 1 1
..
59 1 0 138 271 0 0 182 0 0.0 2 0 2 1 1
140 177 0 1 162 1 0.0 2 1 3 0 1
164 176 1 0 90 0 1.0 1 2 1 0 1
170 326 0 0 140 1 3.4 0 0 3 0 1
29 1 1 130 204 0 0 202 0 0.0 2 0 2 1 1
Length: 302, dtype: int64
# Number of rows that exactly repeat an earlier row.
print(data.duplicated().sum())
1
# Show the row(s) flagged as repeats of an earlier row.
print(data[data.duplicated()])
age sex cp trestbps chol fbs restecg thalach exang oldpeak \
164 38 1 2 138 175 0 1 173 0 0.0
slope ca thal target
164 2 4 2 1
# Keep the first occurrence of every row, then confirm that no repeats remain.
data.drop_duplicates(keep='first', inplace=True)
remaining_duplicates = data.duplicated().sum()
print(remaining_duplicates)
0
# Skewness of each column.
data.skew()
age -0.203743 sex -0.786120 cp 0.493022 trestbps 0.716541 chol 1.147332 fbs 1.981201 restecg 0.169467 thalach -0.532671 exang 0.737281 oldpeak 1.266173 slope -0.503247 ca 1.295738 thal -0.481232 target -0.173691 dtype: float64
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from collections import Counter
import ipywidgets as widgets
from scipy import special
py.offline.init_notebook_mode(connected=True)
# Replace the numeric chest-pain codes with descriptive labels.
# A lookup table is used instead of nested np.where calls: a code outside 0-3 is
# left as it is rather than being turned into the string '0', and re-running the
# cell is harmless because values that are already labels are not matched again.
CP_LABELS = {
    0: 'typical angina',
    1: 'atypical angina',
    2: 'non-anginal pain',
    3: 'asymptomatic',
}
data['cp'] = data['cp'].replace(CP_LABELS)
def pie(df, x, y):
    """Draw an interactive pie chart of how often each value occurs in ``df``.

    Args:
        df: Iterable of hashable values to tally (the callers in this file
            pass a single DataFrame column).
        x: Name given to the category column; also used as the legend title
            and as the chart title.
        y: Name given to the count column.

    Returns:
        None. The figure is rendered with ``py.offline.iplot``.
    """
    tally = Counter(df)
    counts = pd.DataFrame({x: list(tally.keys()), y: list(tally.values())})
    # Pass names/values by keyword so the call does not depend on the
    # positional parameter order of px.pie.
    fig = px.pie(counts, names=x, values=y)
    fig.update_layout(legend_title=dict(text=x), title=dict(text=x))
    py.offline.iplot(fig)
# Label the outcome column for plotting. replace() only converts the raw 0/1
# codes, so re-running this cell does not collapse every row into
# 'Low Risk for CVD' the way np.where(data['target'] == 1, ...) does once the
# column already holds text.
data['target'] = data['target'].replace(
    {1: 'High Risk for CVD', 0: 'Low Risk for CVD'}
)
pie(data['target'], 'Target', 'Number of Patients')
pie(data['cp'], 'Chest pain type', 'Number of patients')
# Tally how many rows fall under each value of the sex column.
freq_sex = data['sex'].value_counts()
freq_sex
1 206 0 96 Name: sex, dtype: int64
# Donut-style pie chart of the sex distribution.
# Wedge labels are looked up from the index of freq_sex instead of being
# hard-coded, so they stay correct whichever category value_counts() lists first.
# (This file maps sex code 1 to male and every other code to female.)
SEX_LABELS = {1: 'Male', 0: 'Female', 'male': 'Male', 'female': 'Female'}
sex_labels = [SEX_LABELS.get(code, str(code)) for code in freq_sex.index]

plt.figure(figsize=(12, 5))
freq_sex.plot.pie(
    labels=sex_labels,
    radius=1.2,
    autopct='%1.2f %%',
    explode=[0.01, 0.02],  # distance of each wedge from the centre of the pie
    textprops={'size': 12, 'color': 'steelblue'},
    wedgeprops={'edgecolor': 'white', 'width': 0.65},  # width < radius leaves a hole
    cmap='Set3',
    shadow=True,
)
plt.ylabel('')  # drop the automatic y-axis label
plt.title('Sex of Patients\n', size=30, color='Purple', weight='bold')
plt.legend(sex_labels, loc=1)
plt.show()
# Recode fbs, restecg and exang into readable labels and chart each one.
# Lookup tables replace the np.where chains: values that are not listed are left
# unchanged (restecg previously fell back to the string '0'), and re-running the
# cell no longer pushes every row into the "else" label.
data['fbs'] = data['fbs'].replace({1: 'FBS>120mg/dl', 0: 'FBS<120mg/dl'})
pie(data['fbs'], 'Fasting Blood Sugar > 120 mg/dl', 'Number of patients')

data['restecg'] = data['restecg'].replace(
    {0: 'normal', 1: 'ST-T wave abnormality', 2: 'LV hypertrophy'}
)
pie(data['restecg'], 'Resting electrocardiographic results', 'Number of patients')

data['exang'] = data['exang'].replace({1: 'yes', 0: 'no'})
pie(data['exang'], 'Exercise Induced Angina', 'Number of patients')
# Pie charts for ca, slope and thal.
pie(data['ca'], 'Number of Major Vessels Colored by Flouroscopy', 'Number of patients')

# Slope codes are translated through a lookup table so that re-running the cell
# does not relabel every row as 'Downward Slope' (the old np.where "else" branch).
data['slope'] = data['slope'].replace(
    {0: 'Upward Slope', 1: 'Flat Slope', 2: 'Downward Slope'}
)
pie(data['slope'], 'Slope of the Peak Exercise ST Segment', 'Number of patients')

pie(data['thal'], 'Type of Thalassemia', 'Number of Patients')
# The target column was already converted to text labels earlier in this file,
# so comparing it with 1 again would mark every patient as 'Low Risk for CVD'.
# replace() converts only raw 0/1 codes and is a no-op on labelled data.
data['target'] = data['target'].replace(
    {1: 'High Risk for CVD', 0: 'Low Risk for CVD'}
)
# Treemap with one tile per age value, coloured by target.
px.treemap(data, path=['age'], color='target')
# Overlaid histograms of resting blood pressure for the two outcome groups.
# The target column holds text labels by this point (it may still hold raw 0/1
# codes if the recoding cell was skipped), so both spellings are accepted;
# comparing labelled data with 0 or 1 would select no rows at all.
high_risk = data['target'].isin([1, 'High Risk for CVD'])

fig = plt.figure(figsize=(14, 4))
fig = plt.subplot(122)  # right-hand panel of a 1x2 grid; rebinds fig to that Axes
sns.histplot(data.loc[~high_risk, "trestbps"], color='blue',
             label='Lower Chance of Heart Attack', kde=True, bins=40)
sns.histplot(data.loc[high_risk, "trestbps"], color='orange',
             label='Higher Chance of Heart Attack', kde=True, bins=40)
plt.title('Impact of Resting Blood Pressure on Heart Attack Risk ', fontsize=17)
plt.legend()
<matplotlib.legend.Legend at 0x137fdcd00>
# Recount the rows per value of the sex column.
freq_sex = data['sex'].value_counts()
freq_sex
1 207 0 96 Name: sex, dtype: int64
# Turn the 0/1 sex codes into text. replace() leaves already-labelled values
# alone, so re-running the cell does not relabel every patient as 'female'
# (which np.where(data['sex'] == 1, ...) does once the column holds text).
data['sex'] = data['sex'].replace({1: 'male', 0: 'female'})
# Sunburst with one sector per sex, coloured by target.
px.sunburst(data, path=['sex'], color='target')
# Bar plot of age for each sex.
sns.catplot(x='sex', y='age', kind='bar', data=data)
<seaborn.axisgrid.FacetGrid at 0x133684160>
# Treemap nested by sex, then chest-pain type, coloured by target.
px.treemap(data, path=['sex','cp'],color='target' )
# Bar plot of trestbps for each sex.
sns.barplot(x='sex',y='trestbps',data=data)
<AxesSubplot:xlabel='sex', ylabel='trestbps'>
# Treemaps nested by sex and then by a second column, each coloured by target.
# Second level: trestbps.
px.treemap(data, path=['sex','trestbps'],color='target' )
# Second level: fbs.
px.treemap(data, path=['sex','fbs'],color='target' )
# Second level: chol.
px.treemap(data, path=['sex','chol'],color='target' )
def barplot(data, x, y, hue):
    """Show a grouped seaborn bar plot on its own 15x10 figure.

    Args:
        data: DataFrame holding the columns to plot.
        x: Column name for the x axis.
        y: Column name for the y axis.
        hue: Column name used to split each x group into coloured bars.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))
    # Draw on the Axes created above explicitly instead of relying on
    # matplotlib's "current axes" state.
    sns.barplot(data=data, x=x, y=y, hue=hue, ax=ax)
    plt.show()
# The target column already holds text labels at this point, so testing it
# against 1 again would relabel every patient as low risk. replace() only
# converts raw 0/1 codes, and uses the same label spelling as the first
# recoding of target in this file.
data['target'] = data['target'].replace(
    {1: 'High Risk for CVD', 0: 'Low Risk for CVD'}
)
# Each entry is (x column, y column, hue column) for one bar plot.
cols = [['sex', 'chol', 'target']]
for x_col, y_col, hue_col in cols:
    barplot(data=data, x=x_col, y=y_col, hue=hue_col)
# Treemaps nested by sex and then by a second column, each coloured by target.
# Second level: restecg.
px.treemap(data,path=['sex','restecg'],color='target')
# Second level: exang.
px.treemap(data,path=['sex','exang'],color='target')
# Second level: slope.
px.treemap(data,path=['sex','slope'],color='target')
# Second level: ca.
px.treemap(data,path=['sex','ca'],color='target')
# Translate the thal codes into labels through a lookup table. Unlike the nested
# np.where chain, re-running this cell does not turn every row into 'No Thal',
# because values that are already labels are left untouched.
THAL_LABELS = {
    0: 'No Thal',
    1: 'Thal Minor',
    2: 'Thal Intermedia',
    3: 'Thal Major',
}
data['thal'] = data['thal'].replace(THAL_LABELS)
# Treemap nested by sex, then thal label, coloured by target.
px.treemap(data, path=['sex', 'thal'], color='target')
# Overlaid histograms of cholesterol for the two outcome groups.
# The target column holds text labels by this point (it may still hold raw 0/1
# codes if the recoding cell was skipped), so both spellings are accepted;
# comparing labelled data with 0 or 1 would select no rows at all.
high_risk = data['target'].isin([1, 'High Risk for CVD'])

fig = plt.figure(figsize=(14, 4))
fig = plt.subplot(122)  # right-hand panel of a 1x2 grid; rebinds fig to that Axes
sns.histplot(data.loc[~high_risk, "chol"], color='pink',
             label='Lower Chance of Heart Attack', kde=True, bins=40)
sns.histplot(data.loc[high_risk, "chol"], color='purple',
             label='Higher Chance of Heart Attack', kde=True, bins=40)
plt.title('Impact of Total Cholesterol Levels on Heart Attack Risk ', fontsize=17)
plt.legend()
<matplotlib.legend.Legend at 0x135569be0>
# Share of high-risk patients for each slope category.
# A bar plot needs a numeric y variable, but target holds text labels by this
# point, so it is turned into a 0/1 indicator on a temporary copy (raw 0/1 codes
# pass through unchanged). The bar height is then the mean of that indicator.
slope_risk = data.assign(
    target=data['target'].isin([1, 'High Risk for CVD']).astype(int)
)
sns.catplot(x='slope', y='target', kind='bar', data=slope_risk)
<seaborn.axisgrid.FacetGrid at 0x1356dbeb0>
# Scatter plot of thalach (x) against age (y), coloured by target.
sns.scatterplot(data=data, x="thalach", y="age", hue="target")
<AxesSubplot:xlabel='thalach', ylabel='age'>
# Scatter plot of oldpeak (x) against age (y), coloured by target.
sns.scatterplot(data=data, x="oldpeak", y="age", hue="target")
<AxesSubplot:xlabel='oldpeak', ylabel='age'>
# Scatter plot of trestbps (x) against age (y), coloured by target.
sns.scatterplot(data=data, x="trestbps", y="age", hue="target")
<AxesSubplot:xlabel='trestbps', ylabel='age'>
# Pairwise plots over the listed columns, coloured by target.
pairplot_columns = [
    'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca',
    'oldpeak', 'chol', 'age', 'trestbps', 'thalach', 'thal', 'target',
]
sns.pairplot(data[pairplot_columns], hue='target')
<seaborn.axisgrid.PairGrid at 0x1359bee50>
# Work on an independent copy of the dataset so the encoding steps cannot alter
# `data` (plain assignment would only create a second name for the same frame).
data_new = data.copy()

# Columns treated as categorical and as continuous, respectively.
cat_cols = ['sex', 'exang', 'ca', 'cp', 'fbs', 'restecg', 'slope', 'thal']
con_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# One-hot encode the categorical columns; drop_first omits the first level of
# each column.
data_new = pd.get_dummies(data_new, columns=cat_cols, drop_first=True)

# Features are every column except the outcome; the outcome is kept as a
# one-column DataFrame.
X = data_new.drop(['target'], axis=1)
y = data_new[['target']]
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,recall_score,precision_score
# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the dimensions of each of the four pieces.
for caption, part in (
    ("The shape of X_train is ", X_train),
    ("The shape of X_test is ", X_test),
    ("The shape of y_train is ", y_train),
    ("The shape of y_test is ", y_test),
):
    print(caption, part.shape)
The shape of X_train is (242, 22) The shape of X_test is (61, 22) The shape of y_train is (242, 1) The shape of y_test is (61, 1)
# Scale the continuous features with RobustScaler to treat for possible outliers.
# The scaler is fitted on the training rows only and then applied to the test
# rows. Previously it was fitted on the full X *after* the split, so the
# X_train / X_test frames used by the model were never scaled, and the fit
# would have included the held-out rows.
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
# Copy first so the assignments below act on independent frames.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[con_cols] = scaler.fit_transform(X_train[con_cols])
X_test[con_cols] = scaler.transform(X_test[con_cols])
# Keep X itself scaled as before, but with the statistics learned from the
# training rows.
X[con_cols] = scaler.transform(X[con_cols])
print("The head for X is")
X.head()
The head for X is
| age | trestbps | chol | thalach | oldpeak | sex_male | exang_yes | ca_1 | ca_2 | ca_3 | ... | cp_2 | cp_3 | fbs_1 | restecg_ST-T wave abnormality | restecg_normal | slope_Flat Slope | slope_Upward Slope | thal_1 | thal_2 | thal_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.592593 | 0.75 | -0.110236 | -0.092308 | 0.9375 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
| 1 | -1.333333 | 0.00 | 0.157480 | 1.046154 | 1.6875 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2 | -1.037037 | 0.00 | -0.566929 | 0.584615 | 0.3750 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 3 | 0.074074 | -0.50 | -0.062992 | 0.769231 | 0.0000 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0.148148 | -0.50 | 1.795276 | 0.307692 | -0.1250 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 22 columns
# Fit a logistic-regression classifier with default settings. y_train is a
# one-column DataFrame, so it is flattened to a 1-D array first.
model = LogisticRegression()
model.fit(X_train, y_train.to_numpy().ravel())
LogisticRegression()
# Predict the held-out rows and summarise the errors in a confusion matrix.
y_pred = model.predict(X_test)

# Fix the class order explicitly so the matrix rows/columns and the tick labels
# below are guaranteed to line up.
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.title('Heatmap of Confusion Matrix', fontsize=18)
# fmt='d' prints the counts as integers (the default format switches to
# scientific notation for counts of 100 or more); the tick labels name the
# class behind each row and column.
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set_xlabel('Predicted label')  # columns of a confusion matrix
ax.set_ylabel('True label')       # rows of a confusion matrix
plt.show()

accuracy_model = accuracy_score(y_test, y_pred)
print(accuracy_model)
0.9016393442622951
# F1 score for each class separately (average=None returns one value per class).
f1_model = f1_score(y_true=y_test, y_pred=y_pred, average=None)
print(f1_model)
[0.90322581 0.9 ]
# Recall for each class separately (average=None returns one value per class).
recall_model = recall_score(y_true=y_test, y_pred=y_pred, average=None)
print(recall_model)
[0.875 0.93103448]
# Precision for each class separately (average=None returns one value per class).
precision_model = precision_score(y_true=y_test, y_pred=y_pred, average=None)
print(precision_model)
[0.93333333 0.87096774]
# Text summary of precision, recall, F1 and support for every class.
cl_report = classification_report(y_true=y_test, y_pred=y_pred)
print(cl_report)
precision recall f1-score support
High Risk for CVD 0.93 0.88 0.90 32
Low Risk for CVD 0.87 0.93 0.90 29
accuracy 0.90 61
macro avg 0.90 0.90 0.90 61
weighted avg 0.90 0.90 0.90 61